%load_ext autoreload
%autoreload 2
from IPython.display import display
import os
import pickle
import numpy as np
import pandas as pd
import statsmodels.api as sm
import arviz as az
import pymc3 as pm
import matplotlib.pyplot as plt
import matplotlib
import seaborn as sns
from modules.models.stats import LMMPerformance
from modules.utils.general_utils.stats_utils import compute_cd_from_regression
def sns_styleset():
    """Configure seaborn/matplotlib for consistent paper-style figures."""
    sns.set(context='paper', style='ticks', font='DejaVu Sans')
    # One rcParams.update call instead of fourteen individual assignments.
    matplotlib.rcParams.update(
        {
            'figure.dpi': 300,
            'axes.linewidth': 1,
            'xtick.major.width': 1,
            'ytick.major.width': 1,
            'xtick.major.size': 3,
            'ytick.major.size': 3,
            'xtick.minor.size': 2,
            'ytick.minor.size': 2,
            'font.size': 13,
            'axes.titlesize': 13,
            'axes.labelsize': 13,
            'legend.fontsize': 13,
            'xtick.labelsize': 13,
            'ytick.labelsize': 13,
        }
    )
sns_styleset()
# Pool the per-fold performance tables (10 CV folds) into one DataFrame.
# Idiom fix: list comprehension instead of a manual append loop.
final_results = pd.concat(
    [
        pd.read_csv(f'results\\tables\\models_performance\\models_performance_fold_{fold_n}.csv')
        for fold_n in range(10)
    ],
    ignore_index=True
)
final_results = final_results.dropna()
# Average fitting time per training epoch.
final_results['time_epochs'] = final_results['fitting_time'] / final_results['epochs']
# Shorten model labels for plotting/reporting (identity for already-short ones).
final_results['Model'] = final_results['Model'].map(
    {
        'Lag 1': 'Lag 1',
        'Median': 'Median',
        'TD E-Net': 'ENet',
        'TD MLP': 'MLP',
        'RNN': 'RNN'
    }
)
# Distinct prediction targets present in the pooled results.
targets = final_results['target'].unique()
# Adjacent-model pairs examined in the post-hoc contrasts.
comparisons = [
    ('Lag 1', 'Median'),
    ('Median', 'ENet'),
    ('ENet', 'MLP'),
    ('MLP', 'RNN')
]
# Persist the pooled table BEFORE rescaling the outcome to percent.
final_results.to_csv('results\\tables\\models_performance\\models_performance_total.csv')
# Express the outcome as a percentage error.
final_results['value'] = final_results['value'] * 100
# Collapse over targets: sum then divide by 5 (the five targets plotted below),
# i.e. the per-fold mean error across targets.
collapsed_target = (
    final_results
    .groupby(['Model', 'Session', 'context', 'fold_n'])['value']
    .sum()
    .reset_index()
)
collapsed_target['value'] = collapsed_target['value'] / 5
# Mean error collapsed over sessions (keeps context) ...
collapsed_time = (
    final_results
    .groupby(['Model', 'target', 'fold_n', 'context'])['value']
    .mean()
    .reset_index()
)
# ... and collapsed over contexts (keeps session).
collapsed_context = (
    final_results
    .groupby(['Model', 'target', 'fold_n', 'Session'])['value']
    .mean()
    .reset_index()
)
# Saved data container from the training pipeline (trusted local file).
with open('results\\saved_data_containers\\melchior.pkl', 'rb') as container:
    data_container = pickle.load(container)
# Overall error distribution per model, averaged across targets.
plt.figure(figsize=(5, 5))
ax = sns.boxplot(
    data=collapsed_target,
    x='Model',
    y='value',
    order=['Lag 1', 'Median', 'ENet', 'MLP', 'RNN']
)
plt.ylabel('% Error')
sns.despine()
plt.tight_layout()
plt.show()
# Mean % error per game context, one panel per prediction target.
box_plots = sns.catplot(
    data=collapsed_time,
    kind='bar',
    x='context',
    y='value',
    col='target',
    hue='Model',
    hue_order=['Lag 1', 'Median', 'ENet', 'MLP', 'RNN'],
    col_wrap=5,
    height=3.5,
    sharey=False
)
box_plots.set_titles('Game {col_name}')
box_plots.set_ylabels('% Error')
plt.show()
# Mean % error over sessions, one panel per prediction target.
line_plots = sns.relplot(
    data=collapsed_context,
    kind='line',
    x='Session',
    y='value',
    hue='Model',
    col='target',
    col_wrap=5,
    height=3.5,
    facet_kws={'sharey': False}
)
line_plots.set_titles('{col_name}')
line_plots.set_ylabels('% Error')
plt.show()
# Full grid: % error over sessions, targets as columns, game contexts as rows.
line_plots = sns.relplot(
    data=final_results,
    kind='line',
    x='Session',
    y='value',
    hue='Model',
    col='target',
    row='context',
    height=4,
    facet_kws={'sharey': False}
)
line_plots.set_titles('')
# Column titles on the top row only, one per target.
column_titles = [
    'Future Absence',
    'Future Active Time',
    'Future Session Time',
    'Future Session Activity',
    'Future N° Sessions'
]
for axis, title in zip(line_plots.axes[0, :], column_titles):
    axis.set_title(title)
# Row labels on the first column only, one per game.
row_games = ['hms', 'jc3', 'lis', 'lisbf', 'jc4', 'hmg']
for axis, game in zip(line_plots.axes[:, 0], row_games):
    axis.set_ylabel(f'Loss Game {game}')
plt.xlim(1, 20)
plt.show()
# Variance components: random effects for session, game context and CV fold.
vcf = {'Session': '0 + C(Session)', 'context': '0 + C(context)', 'fold_n': '0 + C(fold_n)'}
# All rows in one group; variability is absorbed by the vc_formula terms.
final_results['group'] = 1
collapsed_target['group'] = 1
# Contrast rows act on the fixed-effect vector
# [Intercept, ENet, Lag 1, MLP, Median] (treatment coding, RNN as reference),
# matching the c0/c1/c2 labels assigned after t_test below.
contrasts = np.array(
    [
        [0, 0, 1, 0, -1],   # Lag 1 - Median
        [0, -1, 0, 0, 1],   # Median - ENet
        [0, 1, 0, -1, 0],   # ENet - MLP
    ]
)
# Overall linear mixed model: % error ~ model (RNN as reference level),
# with variance components for session, context and fold (vcf).
model = sm.MixedLM.from_formula(
    "value ~ C(Model, Treatment(reference='RNN'))",
    groups='group',
    vc_formula=vcf,
    re_formula='0',
    data=collapsed_target
)
result = model.fit()
display(result.summary())
# Pair residuals with their model labels (same row order as collapsed_target)
# so per-model residual variances can be computed.
residuals_df = pd.DataFrame(
    {
        'Residuals': result.resid,
        'Model': collapsed_target['Model'].values
    }
)
# n and var_1 come from the reference model (RNN); balanced design assumed,
# so the same n is used for both groups in Cohen's d.
n = len(residuals_df[residuals_df['Model'] == 'RNN']['Residuals'])
var_1 = residuals_df[residuals_df['Model'] == 'RNN']['Residuals'].var()
std_effect_sizes = pd.DataFrame(
    columns=[
        'Model',
        'Coeff',
        "Cohen's d"
    ]
)
# Fixed-effect coefficients follow the intercept in this order (patsy
# alphabetical ordering with the RNN reference dropped).
for index, model_name in enumerate(["ENet", "Lag 1", "MLP", "Median"], 1):
    var_2 = residuals_df[residuals_df['Model'] == model_name]['Residuals'].var()
    # BUGFIX: use .iloc — positional access via plain [] on a labelled Series
    # is deprecated and removed in pandas 2.x (would raise KeyError).
    beta = result.params.iloc[index]
    cd = compute_cd_from_regression(
        beta=beta,
        var_1=var_1,
        var_2=var_2,
        n_1=n,
        n_2=n
    )
    std_effect_sizes.loc[index - 1] = [
        model_name,
        beta,
        cd
    ]
# Was a bare expression (a no-op outside a notebook cell); display explicitly,
# consistent with the other outputs in this file.
display(std_effect_sizes)
# Post-hoc pairwise contrasts; round-trip through HTML to obtain a DataFrame.
summary_post_hoc = result.t_test(contrasts).summary()
results_as_html = summary_post_hoc.as_html()
summary_post_hoc = pd.read_html(results_as_html, header=0, index_col=0)[0]
summary_post_hoc = summary_post_hoc.reset_index()
summary_post_hoc = summary_post_hoc.rename(
    {
        'index': 'Contrast'
    },
    axis=1
)
summary_post_hoc['Contrast'] = summary_post_hoc['Contrast'].map(
    {
        'c0': 'Lag 1 - Median',
        'c1': 'Median - ENet',
        'c2': 'ENet - MLP'
    }
)
display(summary_post_hoc)
# Per-target analysis: refit the mixed model separately for each target,
# grouping by CV fold.
for target in final_results['target'].unique():
    print(target)
    data = final_results[final_results['target'] == target]
    model = sm.MixedLM.from_formula(
        "value ~ C(Model, Treatment(reference='RNN') )",
        groups='fold_n',
        vc_formula=vcf,
        re_formula='0',
        data=data
    )
    result = model.fit()
    # BUGFIX: labels must come from the per-target subset (`data`) that
    # produced these residuals — the original used collapsed_target, whose
    # rows (and length) do not match result.resid here.
    residuals_df = pd.DataFrame(
        {
            'Residuals': result.resid,
            'Model': data['Model'].values
        }
    )
    # Reference-group (RNN) size and residual variance for Cohen's d.
    n = len(residuals_df[residuals_df['Model'] == 'RNN']['Residuals'])
    var_1 = residuals_df[residuals_df['Model'] == 'RNN']['Residuals'].var()
    std_effect_sizes = pd.DataFrame(
        columns=[
            'Model',
            'Coeff',
            "Cohen's d"
        ]
    )
    # Coefficient order after the intercept: ENet, Lag 1, MLP, Median
    # (patsy alphabetical ordering with RNN dropped as reference).
    for index, model_name in enumerate(["ENet", "Lag 1", "MLP", "Median"], 1):
        var_2 = residuals_df[residuals_df['Model'] == model_name]['Residuals'].var()
        # BUGFIX: .iloc — plain [] positional access on a labelled Series is
        # deprecated and removed in pandas 2.x.
        beta = result.params.iloc[index]
        cd = compute_cd_from_regression(
            beta=beta,
            var_1=var_1,
            var_2=var_2,
            n_1=n,
            n_2=n
        )
        std_effect_sizes.loc[index - 1] = [
            model_name,
            beta,
            cd
        ]
    # Post-hoc contrasts, parsed back into a DataFrame via HTML.
    summary_post_hoc = result.t_test(contrasts).summary()
    results_as_html = summary_post_hoc.as_html()
    summary_post_hoc = pd.read_html(results_as_html, header=0, index_col=0)[0]
    summary_post_hoc = summary_post_hoc.reset_index()
    summary_post_hoc = summary_post_hoc.rename(
        {
            'index': 'Contrast'
        },
        axis=1
    )
    summary_post_hoc['Contrast'] = summary_post_hoc['Contrast'].map(
        {
            'c0': 'Lag 1 - Median',
            'c1': 'Median - ENet',
            'c2': 'ENet - MLP'
        }
    )
    print('Regression Results')
    display(result.summary())
    print('Standardized Effect Size')
    display(std_effect_sizes)
    print('Post Hoc')
    display(summary_post_hoc)
In order to attenuate the impact of extreme results we use a Student-T distribution for the model's likelihood. Unlike the analysis carried out with statsmodels (see above), we do not use a treatment coding of the independent variable but a simple coding with the reference level given by the grand mean of the dependent variable.
# Bayesian mixed-effects alternative to the statsmodels analysis above
# (Student-T likelihood, per the note preceding this cell).
# NOTE(review): keyword semantics are defined by modules.models.stats.LMMPerformance;
# the column-mapping reading below is inferred from the argument names — confirm
# against that class.
model = LMMPerformance(
    df=final_results,
    models_column='Model',
    contexts_column='context',
    time_column='Session',
    fold_column='fold_n',
    targets_column='target',
    outcomes_column='value',
    robust=True
)
# approx=True presumably selects approximate inference and n the number of
# draws/iterations — TODO confirm in LMMPerformance.analyze.
model.analyze(
    targets=final_results['target'].unique(),
    approx=True,
    n=100000
)
# 2x8 grid: five metric panels (three top, two centered bottom) plus
# model-size and timing panels in the right-hand column.
fig = plt.figure(constrained_layout=True, figsize=(16, 8))
spec = fig.add_gridspec(2, 8)
metric_slots = [
    (0, slice(0, 2)),
    (0, slice(2, 4)),
    (0, slice(4, 6)),
    (1, slice(1, 3)),
    (1, slice(3, 5)),
]
axs_metrics = [fig.add_subplot(spec[row, cols]) for row, cols in metric_slots]
ax_size = fig.add_subplot(spec[0, 6:8])
ax_time = fig.add_subplot(spec[1, 6:8])
########################################################################
# One box plot per target. Idiom fixes: enumerate instead of a manual
# index counter; removed the unused local `y_limits`.
for index, (target, ax_metric) in enumerate(zip(targets, axs_metrics)):
    sns.boxplot(
        x='Model',
        y='value',
        data=final_results[final_results['target'] == target],
        order=['Lag 1', 'Median', 'ENet', 'MLP', 'RNN'],
        ax=ax_metric,
        showfliers=True
    )
    ax_metric.set_title(target)
    # Y label only on the first panel of each row (0 and 3); x ticks hidden
    # on the top row (0-2) since the bottom row repeats the model names.
    if index == 0:
        ax_metric.set_ylabel('% Error')
        ax_metric.set_xlabel('')
        ax_metric.set_xticks([])
    elif index == 3:
        ax_metric.set_ylabel('% Error')
    elif index in [1, 2]:
        ax_metric.set_ylabel('')
        ax_metric.set_xlabel('')
        ax_metric.set_xticks([])
    else:
        ax_metric.set_ylabel('')
    ax_metric.margins(y=0.20)
def _log_bar_panel(ax, y_column, title, y_label, x_label, hide_xticks):
    """Draw a per-model bar plot of *y_column* on a log y scale into *ax*."""
    sns.barplot(
        x='Model',
        y=y_column,
        data=final_results,
        order=['Lag 1', 'Median', 'ENet', 'MLP', 'RNN'],
        ax=ax
    )
    ax.set_xlabel(x_label)
    if hide_xticks:
        ax.set_xticks([])
    ax.set_yscale('log')
    ax.set_title(title)
    ax.set_ylabel(y_label)

# Model size (parameter count) and per-epoch convergence time panels.
_log_bar_panel(ax_size, 'parameters', 'Model Size', '$log_{10}$(N° of Parameters)', '', True)
_log_bar_panel(ax_time, 'time_epochs', 'Convergence Time / Epochs', '$log_{10}$(Seconds)', 'Model', False)
plt.savefig(
    'results\\figures\\models_performance\\models_comparison.png',
    dpi=500,
    bbox_inches='tight'
)